#load libraries and required ASER datasets
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(knitr)
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.8.5, PROJ 9.5.1; sf_use_s2() is TRUE
library(ggplot2)
setwd("/Users/Arunima/Desktop/D3SR")
aser_school_2011 <- read.csv("/Users/Arunima/Desktop/D3SR/ASER School Data 2007-18/ASER 2011/ASER2011-School.txt")
aser_household_2011 <- read.csv("/Users/Arunima/Desktop/D3SR/ASER Household_Village Data 2007-18/ASER 2011/ASER 2011 Household Data.txt")
#select the required variables from the ASER 2011 household dataset
aser_household_2011 <- aser_household_2011 %>%
group_by(state_name, district_name)
aser_hh_selected <- aser_household_2011 %>%
select(state_code, district_code, state_name, district_name, hh_id, child_age, child_gender, hh_tv, hh_mobile, hh_toilet, hh_electricity_conn, hh_type, school_class, oos_never_enr, oos_dropout, oos_dropout_class, school_govt, school_private, school_madarsa, school_other, tuition, read_nothing, read_letter, read_word, read_level_1, read_level_2, math_nothing, math_num_1_9, math_num_10_99, math_subtraction, math_division, father_class, hh_multiplier, mother_age, mother_class, vlg_pucca_road, vlg_electricity, vlg_std_booth, vlg_bank, vlg_private_health_clinic, vlg_internet_cafe, vlg_solar_energy, vlg_private_school)
#select the required variables from the ASER 2011 school level dataset
aser_school_2011 <- aser_school_2011 %>%
group_by(state_name, district_name)
aser_school_selected <- aser_school_2011 %>%
select(state_code, district_code, state_name, district_name, schtype_class, childenrollment_class_1, childenrollment_class_2, childenrollment_class_3, childenrollment_class_4, childenrollment_class_5, childenrollment_class_6, childenrollment_class_7, childenrollment_class_8, regulargovtteacher_appointment, midday_meal_in_school, boundary_wall, computer_in_school, computer_in_school_usable, girls_toilet_in_school, boys_toilet_in_school)
#Creating Indicators
##Create an indicator for Arithematic Score based on the level of learning
aser_hh_selected <- aser_hh_selected %>%
mutate(math_score = case_when(
math_division == 1 ~ 4,
math_subtraction == 1 ~ 3,
math_num_10_99 == 1 ~ 2,
math_num_1_9 == 1 ~ 1,
math_nothing == 1 ~ 0,
TRUE ~ NA_real_
)) %>%
mutate(math_score_scaled = ifelse(!is.na(math_score), math_score / 4, NA)) # scales down the indicator to 4
## Create an indicator for Reading Scores based on the ability to read. This creates an ordinal variable
aser_hh_selected <- aser_hh_selected %>%
mutate(reading_score = case_when(
read_level_2 == 1 ~ 4,
read_level_1 == 1 ~ 3,
read_word == 1 ~ 2,
read_letter == 1 ~ 1,
read_nothing == 1 ~ 0,
TRUE ~ NA_real_
))%>%
mutate(reading_score_scaled = ifelse(!is.na(reading_score), reading_score / 4, NA)) %>% # scales down the indicator to 4
mutate(composite_score= (reading_score_scaled+math_score_scaled)/2) # creating a new variable called composite score to know the education outcome
#create a variable for school enrollment
aser_hh_selected <- aser_hh_selected %>%
mutate(enrolled_any_school = case_when(
oos_dropout == 0 ~ 0,
school_govt == 1 | school_private == 1 | school_madarsa == 1 | school_other == 1 ~ 1,
TRUE ~ 0
))
## creating a binary variable for dropout (Dropout = 1, not dropout = 0)
aser_hh_selected <- aser_hh_selected %>%
mutate(dropout = if_else(is.na(oos_dropout), 1, 0))
#creating category variables for mother's level of education according to ASER's manual
aser_hh_selected$mother_education_cat <- cut(
aser_hh_selected$mother_class,
breaks = c(-1, 0, 5, 8, 10, 12, 15, 17, 18),
labels = c("No formal education",
"Primary (1–5)",
"Upper Primary (6–8)",
"Secondary (9–10)",
"Higher Secondary (11–12)",
"Bachelors (13–15)",
"Postgraduate (16–17)",
"Diploma (18)"),
right = TRUE
)
library(knitr) #loading knitr to use kable function
# View the category distribution in table format
kable(table(aser_hh_selected$mother_education_cat))
| Var1 | Freq |
|---|---|
| No formal education | 296 |
| Primary (1–5) | 91820 |
| Upper Primary (6–8) | 77547 |
| Secondary (9–10) | 66336 |
| Higher Secondary (11–12) | 22377 |
| Bachelors (13–15) | 7913 |
| Postgraduate (16–17) | 1620 |
| Diploma (18) | 533 |
## Creating Household Characteristics Index to control for any confounding variables in the environment
aser_hh_selected <- aser_hh_selected %>%
ungroup() %>%
mutate(
hh_tv = as.numeric(ifelse(hh_tv == 1, 1,
ifelse(hh_tv == 2, 0, NA))),
hh_mobile = as.numeric(ifelse(hh_mobile == 1, 1,
ifelse(hh_mobile == 2, 0, NA))),
hh_toilet = as.numeric(ifelse(hh_toilet == 1, 1,
ifelse(hh_toilet == 2, 0, NA))),
hh_electricity_conn = as.numeric(ifelse(hh_electricity_conn == 1, 1,
ifelse(hh_electricity_conn == 2, 0, NA))),
# Scaling hh_type from 1–3 to 0–1 and creating a column for the same
hh_type_scaled = ifelse(hh_type %in% 1:3, (hh_type - 1) / 2, NA)
) %>%
mutate(
household_characteristics_index = rowMeans(select(.,
hh_tv,
hh_mobile,
hh_electricity_conn,
hh_type_scaled
), na.rm = TRUE)
)
# creating a new column avg_household_characteristics_index to calculate household characteristics at district level
household_characteristics_index <- aser_hh_selected %>%
mutate(district_name = tolower(trimws(district_name))) %>%
group_by(district_name) %>%
summarise(
avg_household_characteristics_index = mean(household_characteristics_index, na.rm = TRUE),
.groups = 'drop'
)
#Aggregating data to the district level using household multiplier which indicates the number of households the sampled household represents in the population of the district and creating a new dataframe
average_HH_district_summary <- aser_hh_selected %>%
mutate(district_name = tolower(trimws(district_name))) %>%
group_by(district_name) %>%
summarise(
avg_reading = weighted.mean(reading_score_scaled, weights = hh_multiplier, na.rm = TRUE),
avg_arithmetic = weighted.mean(math_score_scaled, weights = hh_multiplier, na.rm = TRUE),
avg_child_edu = weighted.mean(composite_score, weights = hh_multiplier, na.rm = TRUE),
avg_enrollment = weighted.mean(enrolled_any_school, weights = hh_multiplier, na.rm = TRUE),
avg_paternal_edu = weighted.mean(father_class, weights = hh_multiplier, na.rm = TRUE),
avg_maternal_edu = weighted.mean(mother_class, weights = hh_multiplier, na.rm = TRUE),
avg_enrollment_ratio = weighted.mean(enrolled_any_school, weights = hh_multiplier, na.rm = TRUE),
avg_dropout = weighted.mean(dropout, weights = hh_multiplier, na.rm = TRUE),
avg_mother_class = weighted.mean(mother_class, weights = hh_multiplier, na.rm = TRUE))
# merging the district characteristics and household characteristics data frames by district name to make a new data frame called household_dist_summary
household_dist_summary<- left_join(average_HH_district_summary,household_characteristics_index, by = "district_name")
##ASER SCHOOL LEVEL DATA 2011
#aggregating child primary school enrolment across classes 1 to 8 to know total enrollment per school
aser_school_selected <- aser_school_selected %>%
ungroup() %>%
mutate(childenrolment = rowSums(select(., childenrollment_class_1:childenrollment_class_8), na.rm = TRUE))
#creating a new variable to measure teacher student ratio based on the number of appointed teachers and child enrollment in a school
aser_school_selected <- aser_school_selected %>%
mutate(teacher_student_ratio = regulargovtteacher_appointment/childenrolment)
#creating School Quality Index to control for the confounders due school infrastructure
aser_school_selected <- aser_school_selected %>%
mutate(
midday_meal_in_school = as.numeric(ifelse(midday_meal_in_school == 1, 1,
ifelse(midday_meal_in_school == 2, 0, NA))),
boundary_wall = as.numeric(ifelse(boundary_wall == 1, 1,
ifelse(boundary_wall == 2, 0, NA))),
computer_in_school = as.numeric(ifelse(computer_in_school == 1, 1,
ifelse(computer_in_school == 2, 0, NA))),
computer_in_school_usable = as.numeric(ifelse(computer_in_school_usable == 1, 1,
ifelse(computer_in_school_usable == 2, 0, NA))),
girls_toilet_in_school = as.numeric(ifelse(girls_toilet_in_school == 1, 1,
ifelse(girls_toilet_in_school == 2, 0, NA))),
boys_toilet_in_school = as.numeric(ifelse(boys_toilet_in_school == 1, 1,
ifelse(boys_toilet_in_school == 2, 0, NA)))
) %>%
mutate(
school_quality_index = rowMeans(select(., #creating a new column for school quality index
midday_meal_in_school,
boundary_wall,
computer_in_school,
computer_in_school_usable,
girls_toilet_in_school,
boys_toilet_in_school
), na.rm = TRUE)
)
#creating a new dataframe to average school quality characteristics at district level
school_quality_index <- aser_school_selected %>%
mutate(district_name = tolower(trimws(district_name))) %>%
group_by(district_name) %>%
summarise(
avg_school_quality_index = mean(school_quality_index, na.rm = TRUE),
childenrolment = mean(childenrolment, na.rm = TRUE),
.groups = 'drop'
)
#creating a new data frame for teacher student ratio
TSR_DF <- aser_school_selected %>%
mutate(district_name = tolower(trimws(district_name))) %>%
group_by(district_name) %>% # or whatever your district ID column is
summarise(
total_students = sum(childenrolment, na.rm = TRUE),
total_teachers = sum(regulargovtteacher_appointment , na.rm = TRUE), # assuming 'teachers' is your column
district_tsr = total_students / total_teachers,
.groups = 'drop'
)
#merging all the school level data frames by district name to consolidate into one dataframe
merged_school <- left_join(school_quality_index,TSR_DF, by = "district_name")
#merging both household level and school level data frames to create one data frame for ASER data
merged_aser <- left_join(household_dist_summary,merged_school, by = "district_name")
#Loading PC11 Data
pca11 <- read.csv("/Users/Arunima/Desktop/D3SR/shrug-pca11-csv/pc11_pca_clean_pc11dist.csv")
#adding state names
state_ids <- read.csv("/Users/Arunima/Downloads/state_key.csv")
pca11 <- merge(pca11, state_ids, by = "pc11_state_id", all = TRUE)
#adding district names
district_ids <- read.csv("/Users/Arunima/Downloads/dis_key.csv")
pca11 <- merge(pca11, district_ids, by = "pc11_district_id", all = TRUE)
#selecting required variables and making two new variables
pca11 <- pca11 %>%
select(pc11_state_id, pc11_district_id, pc11_pca_tot_p, pc11_pca_p_sc, pc11_pca_p_st, pc11_pca_tot_f, pc11_pca_f_lit, pc11_pca_tot_work_p, pc11_pca_tot_work_f, pc11_pca_non_work_f, pc11_pca_non_work_p, pc11_pca_no_hh, district_name, state_name) %>%
mutate(female_lfp = ((1 - (pc11_pca_non_work_f / pc11_pca_tot_f))*100) )%>% #creating a new variable for female labour force participation
mutate(SC_share = (pc11_pca_p_sc / pc11_pca_tot_p)* 100) %>% #creating a new variable to know the proportion of SC population as a proportion of the total population
mutate(ST_share = (pc11_pca_p_st / pc11_pca_tot_p)* 100) #creating a new variable to know the proportion of ST population as a proportion of the total population
#merging the Population Census Abstract 2011 data frame with the ASER data frame
merged <- left_join(pca11, merged_aser, by = "district_name")
# Loading the Night line data for urban controls
night_light_dist <- read.csv("/Users/Arunima/Desktop/D3SR/shrug-viirs-annual-csv/viirs_annual_pc11dist.csv")%>%
group_by(pc11_district_id) %>%
summarise(viirs_annual_mean = mean(viirs_annual_mean, na.rm = TRUE), .groups = 'drop') #summarising the data at district level
#merging the above data frame with night time light data
merged2 <- left_join(merged, night_light_dist, by = "pc11_district_id",relationship="many-to-many")
# saving the cv in desktop for backup
write.csv(merged2, "/Users/Arunima/Desktop/D3SR/merged2.csv", row.names = FALSE)
# Defining a vector of variable names that we want to work with
vars_used <- c("avg_arithmetic", "avg_maternal_edu", "female_lfp",
"viirs_annual_mean", "avg_paternal_edu", "SC_share", "ST_share",
"district_tsr", "avg_school_quality_index", "avg_household_characteristics_index")
# Creating a new data frame that selects only the variables in vars_used then filters rows where any of the numeric variables have NA, NaN, or infinite values
merged2_check <- merged2 %>%
select(all_of(vars_used)) %>%
filter_if(is.numeric, any_vars(is.na(.) | is.nan(.) | is.infinite(.)))
# creates a data frame without NA values
merged2_clean <- merged2 %>%
filter(
if_all(any_of(vars_used), ~ !is.na(.) & !is.nan(.) & is.finite(.))
)
# Cleaning and transforming village-level indicators into binary variables
merged3_clean <- aser_hh_selected %>%
mutate(
vlg_pucca_road = ifelse(vlg_pucca_road == 1, 1,
ifelse(vlg_pucca_road == 2, 0, NA)),
vlg_electricity = ifelse(vlg_electricity == 1, 1,
ifelse(vlg_electricity == 2, 0, NA)),
vlg_std_booth = ifelse(vlg_std_booth == 1, 1,
ifelse(vlg_std_booth == 2, 0, NA)),
vlg_bank = ifelse(vlg_bank == 1, 1,
ifelse(vlg_bank == 2, 0, NA)),
vlg_private_health_clinic = ifelse(vlg_private_health_clinic == 1, 1,
ifelse(vlg_private_health_clinic == 2, 0, NA)),
vlg_internet_cafe = ifelse(vlg_internet_cafe == 1, 1,
ifelse(vlg_internet_cafe == 2, 0, NA)),
vlg_solar_energy = ifelse(vlg_solar_energy == 1, 1,
ifelse(vlg_solar_energy == 2, 0, NA)),
vlg_private_school = ifelse(vlg_private_school == 1, 1,
ifelse(vlg_private_school == 2, 0, NA))
) %>%
# Creating a new variable district_development_index by averaging the binary indicators
mutate(
district_development_index = rowMeans(select(.,
vlg_pucca_road,
vlg_electricity,
vlg_std_booth,
vlg_bank,
vlg_private_health_clinic,
vlg_internet_cafe,
vlg_solar_energy,
vlg_private_school
), na.rm = TRUE)
)
## aggregating to district level by calculating the average index value per district as a new variable
district_characteristics_index <- merged3_clean %>%
mutate(district_name = tolower(trimws(district_name))) %>%
group_by(district_name) %>%
summarise(
avg_district_characteristics_index = mean(district_development_index, na.rm = TRUE),
.groups = 'drop'
)
# merging the district-level development index into the cleaned merged2 data frame by district name
merged2_clean <- merged2_clean %>%
left_join(district_characteristics_index, by = "district_name" )
#running the preliminary regression with the dependent variable (avg_child_edu), independent variable (avg_maternal_edu) and all the controld
model_11 <- lm(
avg_child_edu ~ avg_maternal_edu + female_lfp +
viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
district_tsr + avg_school_quality_index + avg_household_characteristics_index + avg_district_characteristics_index,
data = merged2_clean
)
summary(model_11)
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp +
## viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
## district_tsr + avg_school_quality_index + avg_household_characteristics_index +
## avg_district_characteristics_index, data = merged2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.213758 -0.041057 0.001879 0.041641 0.180013
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.735e-01 3.585e-02 7.627 1.60e-13 ***
## avg_maternal_edu 3.154e-02 4.929e-03 6.398 4.17e-10 ***
## female_lfp 1.249e-03 3.508e-04 3.561 0.000412 ***
## viirs_annual_mean -5.212e-03 2.555e-03 -2.040 0.041924 *
## avg_paternal_edu -6.773e-03 5.124e-03 -1.322 0.186899
## SC_share -3.457e-04 4.604e-04 -0.751 0.453101
## ST_share -3.742e-05 1.943e-04 -0.193 0.847361
## district_tsr -1.461e-05 3.807e-05 -0.384 0.701312
## avg_school_quality_index 3.483e-02 2.565e-02 1.358 0.175247
## avg_household_characteristics_index 2.504e-01 2.714e-02 9.226 < 2e-16 ***
## avg_district_characteristics_index -3.115e-02 3.472e-02 -0.897 0.370120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06513 on 424 degrees of freedom
## Multiple R-squared: 0.5363, Adjusted R-squared: 0.5254
## F-statistic: 49.04 on 10 and 424 DF, p-value: < 2.2e-16
# Running a linear regression model (Model 1) with average maternal education as the only predictor of average child education
model_1 <- lm(
avg_child_edu ~ avg_maternal_edu,
data = merged2_clean
)
summary(model_1)
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu, data = merged2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.252287 -0.053027 0.002396 0.056917 0.201797
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.288183 0.026625 10.82 <2e-16 ***
## avg_maternal_edu 0.047136 0.003546 13.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07976 on 433 degrees of freedom
## Multiple R-squared: 0.2898, Adjusted R-squared: 0.2882
## F-statistic: 176.7 on 1 and 433 DF, p-value: < 2.2e-16
#Running Model 2 by adding female literacy rate and female labor force participation as controls to the basic model
model_2 <- lm(
avg_child_edu ~ avg_maternal_edu + female_lfp,
data = merged2_clean
)
summary(model_2)
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp, data = merged2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.219101 -0.053890 0.001482 0.051807 0.196542
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2084250 0.0285878 7.291 1.48e-12 ***
## avg_maternal_edu 0.0502205 0.0034373 14.610 < 2e-16 ***
## female_lfp 0.0019861 0.0003198 6.210 1.25e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07651 on 432 degrees of freedom
## Multiple R-squared: 0.348, Adjusted R-squared: 0.345
## F-statistic: 115.3 on 2 and 432 DF, p-value: < 2.2e-16
# Running Model 3 by including more covariates: night light intensity, paternal education, SC/ST share, teacher-student ratio, school quality, and household characteristics
model_3 <- lm(
avg_child_edu ~ avg_maternal_edu + female_lfp +
viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
district_tsr + avg_school_quality_index + avg_household_characteristics_index,
data = merged2_clean
)
summary(model_3)
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp +
## viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
## district_tsr + avg_school_quality_index + avg_household_characteristics_index,
## data = merged2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.215157 -0.042040 0.002236 0.042159 0.179477
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.751e-01 3.580e-02 7.686 1.06e-13 ***
## avg_maternal_edu 3.074e-02 4.848e-03 6.342 5.82e-10 ***
## female_lfp 1.273e-03 3.496e-04 3.642 0.000304 ***
## viirs_annual_mean -5.024e-03 2.545e-03 -1.974 0.049033 *
## avg_paternal_edu -6.881e-03 5.121e-03 -1.344 0.179801
## SC_share -2.781e-04 4.541e-04 -0.612 0.540618
## ST_share -1.423e-05 1.925e-04 -0.074 0.941114
## district_tsr -1.721e-05 3.795e-05 -0.453 0.650452
## avg_school_quality_index 2.580e-02 2.359e-02 1.094 0.274690
## avg_household_characteristics_index 2.421e-01 2.551e-02 9.490 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06511 on 425 degrees of freedom
## Multiple R-squared: 0.5354, Adjusted R-squared: 0.5256
## F-statistic: 54.42 on 9 and 425 DF, p-value: < 2.2e-16
# Running Model 4 by further adding the district-level development index to the previous full model
model_4 <- lm(
avg_child_edu ~ avg_maternal_edu + female_lfp +
viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
district_tsr + avg_school_quality_index + avg_household_characteristics_index + avg_district_characteristics_index,
data = merged2_clean
)
summary(model_4)
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp +
## viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
## district_tsr + avg_school_quality_index + avg_household_characteristics_index +
## avg_district_characteristics_index, data = merged2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.213758 -0.041057 0.001879 0.041641 0.180013
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.735e-01 3.585e-02 7.627 1.60e-13 ***
## avg_maternal_edu 3.154e-02 4.929e-03 6.398 4.17e-10 ***
## female_lfp 1.249e-03 3.508e-04 3.561 0.000412 ***
## viirs_annual_mean -5.212e-03 2.555e-03 -2.040 0.041924 *
## avg_paternal_edu -6.773e-03 5.124e-03 -1.322 0.186899
## SC_share -3.457e-04 4.604e-04 -0.751 0.453101
## ST_share -3.742e-05 1.943e-04 -0.193 0.847361
## district_tsr -1.461e-05 3.807e-05 -0.384 0.701312
## avg_school_quality_index 3.483e-02 2.565e-02 1.358 0.175247
## avg_household_characteristics_index 2.504e-01 2.714e-02 9.226 < 2e-16 ***
## avg_district_characteristics_index -3.115e-02 3.472e-02 -0.897 0.370120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06513 on 424 degrees of freedom
## Multiple R-squared: 0.5363, Adjusted R-squared: 0.5254
## F-statistic: 49.04 on 10 and 424 DF, p-value: < 2.2e-16
# Loading the texreg library and displaying all four models side-by-side. specifying to round off at 4 digits
library(texreg)
## Version: 1.39.4
## Date: 2024-07-23
## Author: Philip Leifeld (University of Manchester)
##
## Consider submitting praise using the praise or praise_interactive functions.
## Please cite the JSS article in your publications -- see citation("texreg").
##
## Attaching package: 'texreg'
## The following object is masked from 'package:tidyr':
##
## extract
screenreg(
list(model_1, model_2, model_3, model_4),
column.labels = c("Model 1", "Model 2", "Model 3", "Model 4"),
digits = 4
)
##
## ===========================================================================================
## Model 1 Model 2 Model 3 Model 4
## -------------------------------------------------------------------------------------------
## (Intercept) 0.2882 *** 0.2084 *** 0.2751 *** 0.2735 ***
## (0.0266) (0.0286) (0.0358) (0.0359)
## avg_maternal_edu 0.0471 *** 0.0502 *** 0.0307 *** 0.0315 ***
## (0.0035) (0.0034) (0.0048) (0.0049)
## female_lfp 0.0020 *** 0.0013 *** 0.0012 ***
## (0.0003) (0.0003) (0.0004)
## viirs_annual_mean -0.0050 * -0.0052 *
## (0.0025) (0.0026)
## avg_paternal_edu -0.0069 -0.0068
## (0.0051) (0.0051)
## SC_share -0.0003 -0.0003
## (0.0005) (0.0005)
## ST_share -0.0000 -0.0000
## (0.0002) (0.0002)
## district_tsr -0.0000 -0.0000
## (0.0000) (0.0000)
## avg_school_quality_index 0.0258 0.0348
## (0.0236) (0.0257)
## avg_household_characteristics_index 0.2421 *** 0.2504 ***
## (0.0255) (0.0271)
## avg_district_characteristics_index -0.0311
## (0.0347)
## -------------------------------------------------------------------------------------------
## R^2 0.2898 0.3480 0.5354 0.5363
## Adj. R^2 0.2882 0.3450 0.5256 0.5254
## Num. obs. 435 435 435 435
## ===========================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
# Load the package
library(lfe)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
# Run the fixed effects model
model_a <- felm(avg_child_edu ~ avg_maternal_edu + female_lfp + viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
district_tsr + avg_school_quality_index + avg_household_characteristics_index + avg_district_characteristics_index |
state_name, # Fixed effect (e.g., state)
data = merged2_clean)
summary(model_a)
##
## Call:
## felm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp + viirs_annual_mean + avg_paternal_edu + SC_share + ST_share + district_tsr + avg_school_quality_index + avg_household_characteristics_index + avg_district_characteristics_index | state_name, data = merged2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18293 -0.03278 -0.00180 0.03729 0.16111
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## avg_maternal_edu 1.989e-02 6.016e-03 3.306 0.00103 **
## female_lfp 1.193e-03 4.315e-04 2.765 0.00597 **
## viirs_annual_mean -4.710e-03 2.378e-03 -1.981 0.04828 *
## avg_paternal_edu 8.971e-03 5.772e-03 1.554 0.12090
## SC_share -4.103e-04 5.445e-04 -0.754 0.45157
## ST_share -3.341e-04 2.421e-04 -1.380 0.16836
## district_tsr 2.557e-05 4.278e-05 0.598 0.55028
## avg_school_quality_index 1.450e-01 3.508e-02 4.134 4.35e-05 ***
## avg_household_characteristics_index 2.296e-01 3.595e-02 6.387 4.74e-10 ***
## avg_district_characteristics_index -6.441e-02 4.079e-02 -1.579 0.11511
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05378 on 396 degrees of freedom
## Multiple R-squared(full model): 0.7047 Adjusted R-squared: 0.6763
## Multiple R-squared(proj model): 0.335 Adjusted R-squared: 0.2712
## F-statistic(full model):24.86 on 38 and 396 DF, p-value: < 2.2e-16
## F-statistic(proj model): 19.95 on 10 and 396 DF, p-value: < 2.2e-16
list(model_1, model_2, model_3, model_4)
## [[1]]
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu, data = merged2_clean)
##
## Coefficients:
## (Intercept) avg_maternal_edu
## 0.28818 0.04714
##
##
## [[2]]
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp, data = merged2_clean)
##
## Coefficients:
## (Intercept) avg_maternal_edu female_lfp
## 0.208425 0.050220 0.001986
##
##
## [[3]]
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp +
## viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
## district_tsr + avg_school_quality_index + avg_household_characteristics_index,
## data = merged2_clean)
##
## Coefficients:
## (Intercept) avg_maternal_edu
## 2.751e-01 3.074e-02
## female_lfp viirs_annual_mean
## 1.273e-03 -5.024e-03
## avg_paternal_edu SC_share
## -6.881e-03 -2.781e-04
## ST_share district_tsr
## -1.423e-05 -1.721e-05
## avg_school_quality_index avg_household_characteristics_index
## 2.580e-02 2.421e-01
##
##
## [[4]]
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp +
## viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
## district_tsr + avg_school_quality_index + avg_household_characteristics_index +
## avg_district_characteristics_index, data = merged2_clean)
##
## Coefficients:
## (Intercept) avg_maternal_edu
## 2.735e-01 3.154e-02
## female_lfp viirs_annual_mean
## 1.249e-03 -5.212e-03
## avg_paternal_edu SC_share
## -6.773e-03 -3.457e-04
## ST_share district_tsr
## -3.742e-05 -1.461e-05
## avg_school_quality_index avg_household_characteristics_index
## 3.483e-02 2.504e-01
## avg_district_characteristics_index
## -3.115e-02
library(texreg)
screenreg(
list(model_a), # Supplying a list of one or more models for side-by-side display
type = "latex",
digits = 4, # Setting the number of decimal places being shown in the output to 4
single.row = TRUE, # Displaying coefficients and standard errors in a single row for compactness
custom.model.names = c("Model A"),
custom.coef.names = c( # # Renaming coefficients for clearer interpretation
"Maternal Education",
"Female LFPR",
"Night Time Lights",
"Paternal Education",
"SCs",
"STs",
"District TSR",
"School Quality Index",
"HH Characteristics Index",
"District Characteristics Index"
),
caption = "Regression Results"
)
##
## =====================================================
## Model A
## -----------------------------------------------------
## Maternal Education 0.0199 (0.0060) **
## Female LFPR 0.0012 (0.0004) **
## Night Time Lights -0.0047 (0.0024) *
## Paternal Education 0.0090 (0.0058)
## SCs -0.0004 (0.0005)
## STs -0.0003 (0.0002)
## District TSR 0.0000 (0.0000)
## School Quality Index 0.1450 (0.0351) ***
## HH Characteristics Index 0.2296 (0.0359) ***
## District Characteristics Index -0.0644 (0.0408)
## -----------------------------------------------------
## Num. obs. 435
## R^2 (full model) 0.7047
## R^2 (proj model) 0.3350
## Adj. R^2 (full model) 0.6763
## Adj. R^2 (proj model) 0.2712
## Num. groups: state_name 29
## =====================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
# Load necessary libraries
library(lfe) # For felm
library(dplyr) # For data manipulation
library(texreg) # For regression tables
# Define state lists with names in lowercase, using spaces to match merged2_clean
north_states <- c("jammu kashmir", "himachal pradesh", "punjab", "uttarakhand", "haryana", "delhi", "uttar pradesh", "chandigarh")
south_states <- c("andhra pradesh", "telangana", "karnataka", "kerala", "tamil nadu", "puducherry")
east_states <- c("bihar", "jharkhand", "odisha", "west bengal")
west_states <- c("rajasthan", "gujarat", "maharashtra", "goa", "dadra nagar haveli daman diu", "daman diu")
northeast_states <- c("arunachal pradesh", "nagaland", "manipur", "mizoram", "tripura", "meghalaya", "assam")
central_states <- c("chhattisgarh", "madhya pradesh")
# Combine into a named list
indian_regions <- list(
North = north_states,
South = south_states,
East = east_states,
West = west_states,
Northeast = northeast_states,
Central = central_states
)
# Function to run regression for a region
run_region_model <- function(region, data, region_states) {
# Filter data for the region
region_data <- data %>% filter(state_name %in% region_states)
# Check if there is data
if (nrow(region_data) > 0) {
# Run fixed effects model
model <- felm(
avg_child_edu ~ avg_maternal_edu + female_lfp + viirs_annual_mean +
avg_paternal_edu + SC_share + ST_share + district_tsr +
avg_school_quality_index + avg_household_characteristics_index +
avg_district_characteristics_index | state_name,
data = region_data
)
return(model)
} else {
warning(paste("No data for region:", region))
return(NULL)
}
}
# Run models for all regions using lapply
model_region_fe <- lapply(names(indian_regions), function(region) {
run_region_model(region, merged2_clean, indian_regions[[region]])
})
# Remove NULL entries (regions with no data)
model_region_fe <- model_region_fe[!sapply(model_region_fe, is.null)]
# Name= Name models for screenreg
names(model_region_fe) <- names(indian_regions)[!sapply(model_region_fe, is.null)]
# Display results using screenreg
screenreg(
model_region_fe,
type = "latex",
digits = 4,
single.row = TRUE,
custom.model.names = names(model_region_fe),
custom.coef.names = c(
"Maternal Education",
"Female LFPR",
"Night Time Lights",
"Paternal Education",
"SCs",
"STs",
"District TSR",
"School Quality Index",
"HH Characteristics Index",
"District Characteristics Index"
),
caption = "Regional Fixed Effects Regression Results"
)
##
## ==========================================================================================================================================================
## North South East West Northeast Central
## ----------------------------------------------------------------------------------------------------------------------------------------------------------
## Maternal Education 0.0253 (0.0095) ** 0.0109 (0.0154) 0.0039 (0.0197) 0.0109 (0.0134) 0.0087 (0.0244) 0.0146 (0.0183)
## Female LFPR 0.0030 (0.0006) *** 0.0009 (0.0011) -0.0021 (0.0014) 0.0006 (0.0013) -0.0003 (0.0016) 0.0032 (0.0014) *
## Night Time Lights -0.0039 (0.0032) -0.0023 (0.0028) -0.0189 (0.0162) -0.0026 (0.0161) 0.0147 (0.0347) 0.0091 (0.0273)
## Paternal Education 0.0285 (0.0108) ** -0.0125 (0.0123) 0.0321 (0.0195) 0.0114 (0.0124) 0.0304 (0.0207) -0.0033 (0.0143)
## SCs 0.0004 (0.0007) -0.0023 (0.0011) * 0.0006 (0.0012) 0.0000 (0.0018) 0.0013 (0.0045) -0.0053 (0.0031)
## STs 0.0007 (0.0004) -0.0008 (0.0013) 0.0003 (0.0009) -0.0006 (0.0007) 0.0008 (0.0007) -0.0023 (0.0009) *
## District TSR -0.0001 (0.0001) -0.0026 (0.0009) ** 0.0002 (0.0001) * -0.0011 (0.0005) * 0.0004 (0.0002) -0.0000 (0.0001)
## School Quality Index 0.1028 (0.0608) 0.1793 (0.1101) 0.1365 (0.0791) 0.0467 (0.1304) 0.0905 (0.0842) 0.2144 (0.1089)
## HH Characteristics Index 0.1742 (0.0562) ** 0.5030 (0.1536) ** 0.1626 (0.0973) 0.1763 (0.1098) 0.2689 (0.1108) * 0.0198 (0.1134)
## District Characteristics Index -0.0039 (0.0613) -0.1932 (0.0791) * 0.2052 (0.1351) 0.0078 (0.1099) -0.0425 (0.1166) -0.0099 (0.1564)
## ----------------------------------------------------------------------------------------------------------------------------------------------------------
## Num. obs. 108 79 70 76 58 44
## R^2 (full model) 0.9008 0.6540 0.6426 0.5850 0.6958 0.5605
## R^2 (proj model) 0.6805 0.4655 0.5152 0.4564 0.4026 0.4014
## Adj. R^2 (full model) 0.8846 0.5783 0.5596 0.4898 0.5770 0.4095
## Adj. R^2 (proj model) 0.6284 0.3485 0.4026 0.3317 0.1695 0.1956
## Num. groups: state_name 6 5 4 5 7 2
## ==========================================================================================================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
# Combine the correlated variables into a matrix
pca_data <- merged2_clean[, c("avg_maternal_edu", "avg_paternal_edu")]
# Apply PCA
pca_result <- prcomp(pca_data, scale. = TRUE)
# Add the first principal component as a new variable
merged2_clean$pca_education <- pca_result$x[, 1]
# Now you can use "pca_education" in your regression model
model <- lm(
avg_child_edu ~ avg_maternal_edu + female_lfp +
viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
district_tsr + avg_school_quality_index +
avg_household_characteristics_index + avg_district_characteristics_index,
data = merged2_clean
)
# Check summary and coefficients
summary(model)
##
## Call:
## lm(formula = avg_child_edu ~ avg_maternal_edu + female_lfp +
## viirs_annual_mean + avg_paternal_edu + SC_share + ST_share +
## district_tsr + avg_school_quality_index + avg_household_characteristics_index +
## avg_district_characteristics_index, data = merged2_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.213758 -0.041057 0.001879 0.041641 0.180013
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.735e-01 3.585e-02 7.627 1.60e-13 ***
## avg_maternal_edu 3.154e-02 4.929e-03 6.398 4.17e-10 ***
## female_lfp 1.249e-03 3.508e-04 3.561 0.000412 ***
## viirs_annual_mean -5.212e-03 2.555e-03 -2.040 0.041924 *
## avg_paternal_edu -6.773e-03 5.124e-03 -1.322 0.186899
## SC_share -3.457e-04 4.604e-04 -0.751 0.453101
## ST_share -3.742e-05 1.943e-04 -0.193 0.847361
## district_tsr -1.461e-05 3.807e-05 -0.384 0.701312
## avg_school_quality_index 3.483e-02 2.565e-02 1.358 0.175247
## avg_household_characteristics_index 2.504e-01 2.714e-02 9.226 < 2e-16 ***
## avg_district_characteristics_index -3.115e-02 3.472e-02 -0.897 0.370120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06513 on 424 degrees of freedom
## Multiple R-squared: 0.5363, Adjusted R-squared: 0.5254
## F-statistic: 49.04 on 10 and 424 DF, p-value: < 2.2e-16
library(texreg)
screenreg(model,
digits = 3,
single.row = TRUE,
stars = c(0.01, 0.05, 0.1),
custom.coef.names = c("Intercept",
"Avg. Maternal Education",
"Female LFP",
"Night Lights (VIIRS)",
"Avg. Paternal Education",
"SC Share",
"ST Share",
"TSR Score",
"School Quality Index",
"Household Characteristics Index",
"District Characteristics Index"))
##
## ====================================================
## Model 1
## ----------------------------------------------------
## Intercept 0.273 (0.036) ***
## Avg. Maternal Education 0.032 (0.005) ***
## Female LFP 0.001 (0.000) ***
## Night Lights (VIIRS) -0.005 (0.003) **
## Avg. Paternal Education -0.007 (0.005)
## SC Share -0.000 (0.000)
## ST Share -0.000 (0.000)
## TSR Score -0.000 (0.000)
## School Quality Index 0.035 (0.026)
## Household Characteristics Index 0.250 (0.027) ***
## District Characteristics Index -0.031 (0.035)
## ----------------------------------------------------
## R^2 0.536
## Adj. R^2 0.525
## Num. obs. 435
## ====================================================
## *** p < 0.01; ** p < 0.05; * p < 0.1
merged2_clean$predicted_values <- predict(model_4)
# Calculate residuals (observed - predicted)
merged2_clean$residuals <- merged2_clean$avg_arithmetic - merged2_clean$predicted_values
# Create a plot
plot(merged2_clean$predicted_values, merged2_clean$residuals,
main="Residuals vs Predicted",
xlab="Predicted Values", ylab="Residuals")
# Add a horizontal line at 0 for better visualization
abline(h = 0, col = "red")
# Highlight outliers (e.g., define an outlier if residuals are > 3 standard deviations)
outliers <- merged2_clean[abs(merged2_clean$residuals) > 3*sd(merged2_clean$residuals), ]
# Add labels for outliers
text(outliers$predicted_values, outliers$residuals,
labels = rownames(outliers), pos = 4, col = "blue")
# Correlational Matrix
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Selecting a subset of variables from merged2_clean
new_Df <- merged2_clean %>%
select(female_lfp:avg_district_characteristics_index)
# Converting all columns to numeric where possible
numeric_data <- data.frame(lapply(new_Df, function(x) {
if (is.numeric(x)) return(x)
suppressWarnings(as.numeric(as.character(x)))
}))
# Identifying columns that have zero variance
zero_var_cols <- sapply(numeric_data, function(x) sd(x, na.rm = TRUE) == 0)
names(zero_var_cols[zero_var_cols == TRUE]) # Show column names with zero variance
## character(0)
# Removing columns that became entirely NA after conversion
numeric_data <- numeric_data[, colSums(!is.na(numeric_data)) > 0]
# computing correlation
cor_matrix <- cor(numeric_data, use = "complete.obs")
# Melting the correlation matrix for ggplot2 compatibility
melted_cor <- melt(cor_matrix)
# Creating a heatmap with colored tiles and correlation values as text
heatmap <- ggplot(melted_cor, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
geom_text(aes(label = round(value, 2)), size = 4, color = "black") +
scale_fill_gradient2(low = "darkblue", high = "red", mid = "white",
midpoint = 0, limit = c(-1, 1), space = "Lab",
name = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 10, hjust = 1)) +
coord_fixed()
#displaying the map
heatmap
# Saving the heatmap as a PNG file
ggsave("plot.png", plot = heatmap, width = 8, height = 6)
# Read in India shapefile (adjust path or use geojson)
dist_shp <- st_read("/Users/Arunima/Desktop/D3SR/shrug-pc11dist-poly-shp/district.shp") # or use a geojson
## Reading layer `district' from data source
## `/Users/Arunima/Desktop/D3SR/shrug-pc11dist-poly-shp/district.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 641 features and 3 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 68.13327 ymin: 6.754878 xmax: 97.40441 ymax: 37.07827
## Geodetic CRS: WGS 84
#renmaing colum to match for merging
dist_shp <- dist_shp %>%
rename(pc11_district_id = pc11_d_id)
dist_shp <- dist_shp %>%
mutate(pc11_district_id = as.integer(pc11_district_id))
merged2_clean <- merged2_clean %>%
mutate(pc11_district_id = as.integer(pc11_district_id))
# Joining map with data
dist_shp <- dist_shp %>%
left_join(merged2_clean, by = "pc11_district_id")
#plotting the map for avergae child education over India
ggplot(dist_shp) +
geom_sf(aes(fill = avg_child_edu), color = NA) +
scale_fill_viridis_c(name = "Avg. Child Outcome", na.value = "black") +
theme_minimal() +
labs(
title = "Average Children's Education Level by District",
caption = "Data Source: ASER + 2011 Census"
)
#Box Plots
library(ggplot2)
library(dplyr)
# Bin avg_mother_edu into categories
merged2_clean <- merged2_clean %>%
mutate(mother_edu_bin = cut(avg_maternal_edu,
breaks = quantile(avg_maternal_edu, probs = seq(0, 1, 0.2), na.rm = TRUE),
include.lowest = TRUE,
labels = c("Very Low", "Low", "Medium", "High", "Very High")))
# Plot the boxplot
ggplot(merged2_clean, aes(x = mother_edu_bin, y = avg_child_edu)) +
geom_boxplot(fill = "skyblue") +
labs(
title = "Boxplot of Average Child Education by Mother's Education Level",
x = "Mother's Education (Binned)",
y = "Average Years of Child Education"
) +
theme_minimal()
# Bin female_lfp
merged2_clean <- merged2_clean %>%
mutate(female_lfp_bin = cut(female_lfp,
breaks = quantile(female_lfp, probs = seq(0, 1, 0.2), na.rm = TRUE),
include.lowest = TRUE,
labels = c("Very Low", "Low", "Medium", "High", "Very High")))
# Plot
ggplot(merged2_clean, aes(x = female_lfp_bin, y = avg_child_edu)) +
geom_boxplot(fill = "lightgreen") +
labs(
title = "Boxplot of Average Child Education by Female Labour Force Participation",
x = "Female Labour Force Participation (Binned)",
y = "Average Years of Child Education"
) +
theme_minimal()
# Boxplot of avg_child_edu by District
ggplot(merged2_clean, aes(x = "", y = avg_child_edu)) +
geom_boxplot(fill = "cyan", width = 0.2) + # reduce width here
labs(title = "Distribution of Average Child Education Across Districts",
y = "Average Years of Child Education",
x = "") +
theme_minimal()
# Normalize the district index to range from 0 to 1
merged2_clean$district_index <- scales::rescale(as.numeric(factor(merged2_clean$district_name)))
# Identify extremes
extremes <- merged2_clean %>%
filter(avg_child_edu == min(avg_child_edu, na.rm = TRUE) |
avg_child_edu == max(avg_child_edu, na.rm = TRUE))
# Remove extremes from merged2_clean for plotting
merged2_clean_no_extremes <- merged2_clean %>%
filter(!(avg_child_edu %in% extremes$avg_child_edu))
# Scatterplot (excluding extremes)
ggplot(merged2_clean_no_extremes, aes(x = avg_maternal_edu, y = avg_child_edu)) +
geom_point(alpha = 0.6, color = "black") +
geom_point(data = extremes, aes(x = district_index, y = avg_child_edu), color = "red", size = 3) +
geom_smooth(method = "lm", se = FALSE, color = "red", linetype = "solid") + # Adding the trend line
labs(title = "Scatterplot of Average Child Education Across Districts",
x = "District Index (Normalized)",
y = "Average Years of Child Education") +
scale_x_continuous(limits = c(4.5, NA)) + # Set x-axis to start at 4.5
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Load required packages
library(ggplot2)
library(dplyr)
library(sf) # For maps
library(ggthemes)
library(RColorBrewer)
library(cowplot) # For combining plots
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggthemes':
##
## theme_map
## The following object is masked from 'package:lubridate':
##
## stamp
library(viridis) # Color palettes
## Loading required package: viridisLite
library(ggpubr) # For regression lines
## Registered S3 method overwritten by 'broom':
## method from
## nobs.felm lfe
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:cowplot':
##
## get_legend
library(tidyr)
child_edu_mother_edu <- ggplot(merged2, aes(x = avg_maternal_edu, y = avg_child_edu)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = TRUE, color = "blue") +
labs(title = "Mother's Education vs Child's Composite Test Score",
x = "Mother's Years of Schooling",
y = "Child Test Score") +
theme_minimal()
child_edu_mother_edu
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 205 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 205 rows containing missing values or values outside the scale range
## (`geom_point()`).
read_mother_edu <- ggplot(merged2, aes(x = avg_maternal_edu, y = avg_reading)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = TRUE, color = "blue") +
labs(title = "Mother's Education vs Child's Reading Test Score",
x = "Mother's Years of Schooling",
y = "Child Test Score") +
theme_minimal()
read_mother_edu
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 205 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 205 rows containing missing values or values outside the scale range
## (`geom_point()`).
math_mother_edu <- ggplot(merged2, aes(x = avg_maternal_edu, y = avg_arithmetic)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = TRUE, color = "blue") +
labs(title = "Mother's Education vs Child's Arithmetic Test Score",
x = "Mother's Years of Schooling",
y = "Child Test Score") +
theme_minimal()
math_mother_edu
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 205 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Removed 205 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Ensure the district ID in merged2 is a character with leading zeros to match the shapefile
merged2 <- merged2 %>%
mutate(pc11_district_id = sprintf("%03d", as.integer(pc11_district_id)))
#mapping mothers level of education
summary(merged2_clean)
## pc11_state_id pc11_district_id pc11_pca_tot_p pc11_pca_p_sc
## Min. : 1.00 Min. : 1.0 Min. : 8004 Min. : 0
## 1st Qu.: 9.00 1st Qu.:164.0 1st Qu.: 1092055 1st Qu.: 118404
## Median :18.00 Median :318.0 Median : 1847023 Median : 292144
## Mean :17.33 Mean :324.8 Mean : 2161090 Mean : 361354
## 3rd Qu.:24.50 3rd Qu.:494.0 3rd Qu.: 2985834 3rd Qu.: 508104
## Max. :34.00 Max. :637.0 Max. :11060148 Max. :2168304
## pc11_pca_p_st pc11_pca_tot_f pc11_pca_f_lit pc11_pca_tot_work_p
## Min. : 0 Min. : 3590 Min. : 1822 Min. : 3555
## 1st Qu.: 8845 1st Qu.: 529640 1st Qu.: 269162 1st Qu.: 462379
## Median : 67180 Median : 893164 Median : 491775 Median : 751930
## Mean : 175871 Mean :1051181 Mean : 584051 Mean : 861377
## 3rd Qu.: 231888 3rd Qu.:1427611 3rd Qu.: 792352 3rd Qu.:1149398
## Max. :1564369 Max. :5195070 Max. :3635765 Max. :4492767
## pc11_pca_tot_work_f pc11_pca_non_work_f pc11_pca_non_work_p pc11_pca_no_hh
## Min. : 1327 Min. : 2263 Min. : 4449 Min. : 1952
## 1st Qu.: 122845 1st Qu.: 372393 1st Qu.: 643450 1st Qu.: 227110
## Median : 234266 Median : 658186 Median :1097721 Median : 375873
## Mean : 272066 Mean : 779115 Mean :1299713 Mean : 445447
## 3rd Qu.: 361767 3rd Qu.:1060028 3rd Qu.:1751370 3rd Qu.: 602288
## Max. :1239177 Max. :4262784 Max. :6567381 Max. :2529165
## district_name state_name female_lfp SC_share
## Length:435 Length:435 Min. : 7.631 Min. : 0.000
## Class :character Class :character 1st Qu.:18.682 1st Qu.: 8.492
## Mode :character Mode :character Median :28.637 Median :16.409
## Mean :28.616 Mean :15.197
## 3rd Qu.:37.806 3rd Qu.:20.865
## Max. :64.039 Max. :50.170
## ST_share avg_reading avg_arithmetic avg_child_edu
## Min. : 0.0000 Min. :0.2754 Min. :0.2440 Min. :0.2597
## 1st Qu.: 0.4449 1st Qu.:0.6005 1st Qu.:0.5386 1st Qu.:0.5722
## Median : 3.7993 Median :0.6732 Median :0.6137 Median :0.6476
## Mean :15.8327 Mean :0.6664 Mean :0.6090 Mean :0.6385
## 3rd Qu.:17.1247 3rd Qu.:0.7374 3rd Qu.:0.6838 3rd Qu.:0.7101
## Max. :97.8188 Max. :0.8731 Max. :0.8481 Max. :0.8596
## avg_enrollment avg_paternal_edu avg_maternal_edu avg_enrollment_ratio
## Min. :0.5971 Min. : 5.473 Min. : 4.645 Min. :0.5971
## 1st Qu.:0.8002 1st Qu.: 7.949 1st Qu.: 6.717 1st Qu.:0.8002
## Median :0.8514 Median : 8.556 Median : 7.351 Median :0.8514
## Mean :0.8428 Mean : 8.561 Mean : 7.431 Mean :0.8428
## 3rd Qu.:0.8932 3rd Qu.: 9.250 3rd Qu.: 7.997 3rd Qu.:0.8932
## Max. :0.9784 Max. :11.199 Max. :11.225 Max. :0.9784
## avg_dropout avg_mother_class avg_household_characteristics_index
## Min. :0.8755 Min. : 4.645 Min. :0.1649
## 1st Qu.:0.9645 1st Qu.: 6.717 1st Qu.:0.4775
## Median :0.9815 Median : 7.351 Median :0.6373
## Mean :0.9746 Mean : 7.431 Mean :0.6304
## 3rd Qu.:0.9906 3rd Qu.: 7.997 3rd Qu.:0.8046
## Max. :1.0000 Max. :11.225 Max. :0.9555
## avg_school_quality_index childenrolment total_students total_teachers
## Min. :0.06667 Min. : 16.76 Min. : 159 Min. : 4.00
## 1st Qu.:0.47933 1st Qu.: 93.85 1st Qu.: 2504 1st Qu.: 40.00
## Median :0.60705 Median :138.37 Median : 3787 Median : 69.00
## Mean :0.58886 Mean :165.01 Mean : 4653 Mean : 83.34
## 3rd Qu.:0.70829 3rd Qu.:190.07 3rd Qu.: 5483 3rd Qu.:110.00
## Max. :0.96190 Max. :654.37 Max. :18955 Max. :358.00
## district_tsr viirs_annual_mean avg_district_characteristics_index
## Min. : 8.48 Min. : 0.000602 Min. :0.1417
## 1st Qu.: 32.55 1st Qu.: 0.353918 1st Qu.:0.3498
## Median : 46.74 Median : 0.604526 Median :0.4184
## Mean : 85.77 Mean : 0.904440 Mean :0.4386
## 3rd Qu.: 90.73 3rd Qu.: 1.055772 3rd Qu.:0.5060
## Max. :1053.50 Max. :19.192452 Max. :0.9376
## pca_education predicted_values residuals mother_edu_bin
## Min. :-4.042045 Min. :0.4735 Min. :-0.22947 Very Low :87
## 1st Qu.:-0.899107 1st Qu.:0.5806 1st Qu.:-0.07554 Low :87
## Median :-0.002101 Median :0.6373 Median :-0.02902 Medium :87
## Mean : 0.000000 Mean :0.6385 Mean :-0.02947 High :87
## 3rd Qu.: 0.963014 3rd Qu.:0.6932 3rd Qu.: 0.01371 Very High:87
## Max. : 3.713781 Max. :0.8129 Max. : 0.18805
## female_lfp_bin district_index
## Very Low :87 Min. :0.0000
## Low :87 1st Qu.:0.2477
## Medium :87 Median :0.5000
## High :87 Mean :0.4991
## Very High:87 3rd Qu.:0.7477
## Max. :1.0000
# Select and reshape
plot_data <- dist_shp %>%
select(geometry, avg_arithmetic, avg_household_characteristics_index, avg_child_edu) %>%
pivot_longer(cols = c(avg_arithmetic, avg_household_characteristics_index),
names_to = "variable", values_to = "value")
# Plot
ggplot(plot_data) +
geom_sf(aes(fill = value), color = "black", size = 0.1) +
scale_fill_viridis_c(na.value = "gray") +
facet_wrap(~ variable) +
theme_minimal() +
labs(title = "District-Level Comparison", fill = "Value")
### Overlay map
ggplot() +
# First layer: average arithmetic
geom_sf(data = dist_shp, aes(fill = avg_child_edu), color = NA, alpha = 0.6) +
# Second layer: average reading
geom_sf(data = dist_shp, aes(fill = avg_maternal_edu), color = NA, alpha = 0.6) +
# Third layer: household_characteristics_index
geom_sf(data = dist_shp, aes(fill = avg_household_characteristics_index), color = NA, alpha = 0.5) +
scale_fill_viridis_c(na.value = "gray") +
theme_minimal() +
labs(title = "Overlay: Child Education, Maternal Education & Household Index") +
theme(legend.position = "none")
library(patchwork) # For side-by-side plots
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
p1 <- ggplot(dist_shp) +
geom_sf(aes(fill = avg_maternal_edu), color = "black", size = 0.1) +
scale_fill_viridis_c(option = "viridis", na.value = "gray") +
theme_minimal() +
labs(title = "District-wise Average Years of Maternal Education", fill = "Score")
p2 <- ggplot(dist_shp) +
geom_sf(aes(fill = female_lfp), color = "black", size = 0.1) +
scale_fill_viridis_c(option = "viridis", na.value = "gray") +
theme_minimal() +
labs(title = "District-wise Female Labor Force Participation Rate", fill = "Score")
# Display side by side
p1 + p2
#map to plot average child education with hiusehold characteristics
ggplot() +
geom_sf(data = dist_shp, aes(fill = avg_child_edu), color = "grey90") +
geom_sf(data = dist_shp %>% filter(avg_household_characteristics_index > 0.5),
fill = NA, color = "black", size = 0.4) +
scale_fill_viridis_c(na.value = "gray") +
theme_minimal() +
labs(title = "Average Child Education with HH Index")
merged2_clean <- merged2_clean %>%
mutate(zone = case_when(
state_name %in% c("jammu kashmir", "himachal pradesh", "punjab", "uttarakhand", "haryana", "delhi", "uttar pradesh", "chandigarh") ~ "North",
state_name %in% c("andhra pradesh", "telangana", "karnataka", "kerala", "tamil nadu", "puducherry") ~ "South",
state_name %in% c("bihar", "jharkhand", "odisha", "west bengal") ~ "East",
state_name %in% c("rajasthan", "gujarat", "maharashtra", "goa", "dadra nagar haveli daman diu", "daman diu") ~ "West",
state_name %in% c("arunachal pradesh", "nagaland", "manipur", "mizoram", "tripura", "meghalaya", "assam") ~ "Northeast",
state_name %in% c("chhattisgarh", "madhya pradesh") ~ "Central",
TRUE ~ NA_character_
))
viirs_cutoff <- quantile(merged2_clean$viirs_annual_mean, 0.95, na.rm = TRUE)
# Step 2: Add urban_dummy to merged2_clean itself
merged2_clean <- merged2_clean %>%
mutate(urban_dummy = ifelse(viirs_annual_mean >= viirs_cutoff, 1, 0)) %>%
mutate(urban_label = ifelse(urban_dummy == 1, "Urban", "Rural"))
merged2_clean <- merged2_clean %>%
mutate(
urban_label = ifelse(urban_dummy == 1, "Urban", "Rural")
)
merged2_clean <- merged2_clean %>%
filter(!is.na(zone))
ggplot(merged2_clean, aes(x = zone, y = avg_child_edu, fill = urban_label)) +
geom_boxplot(position = position_dodge(0.8), width = 0.6) +
labs(
title = "Child Education by Region and Urban/Rural Classification",
x = "Region",
y = "Average Arithmetic Score",
fill = "Location Type"
) +
theme_minimal(base_size = 13) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
merged2_clean <- merged2_clean %>%
mutate(zone = case_when(
state_name %in% c("jammu kashmir", "himachal pradesh", "punjab", "uttarakhand", "haryana", "delhi", "uttar pradesh", "chandigarh") ~ "North",
state_name %in% c("andhra pradesh", "telangana", "karnataka", "kerala", "tamil nadu", "puducherry") ~ "South",
state_name %in% c("bihar", "jharkhand", "odisha", "west bengal") ~ "East",
state_name %in% c("rajasthan", "gujarat", "maharashtra", "goa", "dadra nagar haveli daman diu", "daman diu") ~ "West",
state_name %in% c("arunachal pradesh", "nagaland", "manipur", "mizoram", "tripura", "meghalaya", "assam") ~ "Northeast",
state_name %in% c("chhattisgarh", "madhya pradesh") ~ "Central",
TRUE ~ NA_character_
))
library(ggplot2)
ggplot(merged2_clean, aes(x = avg_maternal_edu, y = avg_child_edu, color = zone)) +
geom_point(size = 2, alpha = 0.7) +
scale_color_brewer(palette = "Set1", na.value = "grey") +
labs(
title = "Average Child Education vs. Average Maternal Education by Zone",
x = "Average Maternal Education (Years)",
y = "Average Child Education (Score)",
color = "Zone"
) +
theme_minimal()
The End
Thank you!